Base R plots look awful; don’t use them. The plot on the right-hand side is the ggplot version.

It has a legend, and the axes are well labeled.

ggplot will be the function to use all the time for now

2013-2014 came out ggplot,

ggplot2/lattice is another graphics package

tidy data is very important!!

grammar of graphics (gg)

Start with a dataset; pick the features of the graphic you care about (axes, color) and map them to variables in the data; then choose a geom to show them.

panels … be consistent …

# Daily 2017 weather records for three stations, pulled through rnoaa
# (the underlying data are publicly available).
weather_df <-
  rnoaa::meteo_pull_monitors(
    c("USW00094728", "USC00519397", "USS0023B17S"),
    var = c("PRCP", "TMIN", "TMAX"),
    date_min = "2017-01-01",
    date_max = "2017-12-31"
  ) %>%
  mutate(
    # Replace the station ids with readable location labels.
    name = recode(
      id,
      USW00094728 = "CentralPark_NY",
      USC00519397 = "Waikiki_HA",
      USS0023B17S = "Waterhole_WA"
    ),
    # Temperatures come in tenths of a degree Celsius; divide by 10 for degrees.
    tmin = tmin / 10,
    tmax = tmax / 10
  ) %>%
  # Move the label and id columns to the front.
  select(name, id, everything())
## Registered S3 method overwritten by 'crul':
##   method                 from
##   as.character.form_file httr
## Registered S3 method overwritten by 'hoardr':
##   method           from
##   print.cache_info httr
## file path:          /Users/macbook/Library/Caches/rnoaa/ghcnd/USW00094728.dly
## file last updated:  2019-09-04 21:33:58
## file min/max dates: 1869-01-01 / 2019-09-30
## file path:          /Users/macbook/Library/Caches/rnoaa/ghcnd/USC00519397.dly
## file last updated:  2019-09-04 21:34:09
## file min/max dates: 1965-01-01 / 2019-09-30
## file path:          /Users/macbook/Library/Caches/rnoaa/ghcnd/USS0023B17S.dly
## file last updated:  2019-09-04 21:34:13
## file min/max dates: 1999-09-01 / 2019-09-30
# Print the tibble to inspect its columns and first rows.
weather_df
## # A tibble: 1,095 x 6
##    name           id          date        prcp  tmax  tmin
##    <chr>          <chr>       <date>     <dbl> <dbl> <dbl>
##  1 CentralPark_NY USW00094728 2017-01-01     0   8.9   4.4
##  2 CentralPark_NY USW00094728 2017-01-02    53   5     2.8
##  3 CentralPark_NY USW00094728 2017-01-03   147   6.1   3.9
##  4 CentralPark_NY USW00094728 2017-01-04     0  11.1   1.1
##  5 CentralPark_NY USW00094728 2017-01-05     0   1.1  -2.7
##  6 CentralPark_NY USW00094728 2017-01-06    13   0.6  -3.8
##  7 CentralPark_NY USW00094728 2017-01-07    81  -3.2  -6.6
##  8 CentralPark_NY USW00094728 2017-01-08     0  -3.8  -8.8
##  9 CentralPark_NY USW00094728 2017-01-09     0  -4.9  -9.9
## 10 CentralPark_NY USW00094728 2017-01-10     0   7.8  -6  
## # … with 1,085 more rows

create a ggplot

# Data and aesthetic mappings only: with no geom this draws an empty canvas.
weather_df %>%
  ggplot(aes(x = tmin, y = tmax))

# Adding a point geom turns the empty canvas into a tmin-vs-tmax scatterplot.
weather_df %>%
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).

# Alternative ways of making this plot.
# By default a ggplot object is printed (drawn) when it is evaluated.
# Most of the time we will not use this style.
#weather_df %>% filter(name == "CentralPark_NY")
#scatterplot = weather_df %>% 
#  ggplot(aes(x = tmin, y = tmax)) + geom_point()
#scatterplot
#weather_df %>%
#  ggplot(aes(x = tmin, y = tmax)) + 
#  geom_point()

# Keep the data + mapping in an object so geoms can be added to it later.
plot_weather <- ggplot(weather_df, aes(x = tmin, y = tmax))

# Adding a geom to the saved object draws the scatterplot.
plot_weather +
  geom_point()
## Warning: Removed 15 rows containing missing values (geom_point).

# Colour the points by station; the colour mapping lives in the point geom only.
weather_df %>%
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name))
## Warning: Removed 15 rows containing missing values (geom_point).

# Semi-transparent points coloured by station, plus one overall smooth curve.
weather_df %>%
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = .5) +
  # se = FALSE drops the grey uncertainty band around the curve; the band is
  # usually not very useful here.
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

# Facet: one panel per station in a single figure, rather than sending
# multiple separate plots.
weather_df %>%
  ggplot(aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = .5) +
  geom_smooth(se = FALSE) +
  facet_grid(. ~ name)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

# The previous plot is fine but not very interesting; plot tmax over the year.
# color = name is set in ggplot(), so it applies globally to every geom.
weather_df %>%
  ggplot(aes(x = date, y = tmax, color = name)) +
  # size = prcp scales each point ("bubble") by the precipitation amount.
  geom_point(aes(size = prcp), alpha = .5) +
  geom_smooth(se = FALSE) +
  facet_grid(. ~ name)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).

# alpha sets the transparency level of the points: 1 is not transparent at
# all, and smaller values are more see-through.
# Central Park only, with temperatures converted from Celsius to Fahrenheit
# and a straight-line (lm) fit without its confidence band.
weather_df %>%
  filter(name == "CentralPark_NY") %>%
  mutate(
    tmax_fahr = tmax * (9 / 5) + 32,
    tmin_fahr = tmin * (9 / 5) + 32
  ) %>%
  ggplot(mapping = aes(x = tmin_fahr, y = tmax_fahr)) +
  geom_point(alpha = .5) +
  geom_smooth(method = "lm", se = FALSE)

Why do ‘aes’ positions matter? (se = FALSE means no confidence intervals are drawn.)

Defining color at the global (x, y) level versus at the geom level makes the plots look different.

When color is defined at the geom level (inside geom_point), the smooth line has only one color.

When color is defined outside the geom (in ggplot()), the smooth lines are colored by name (three different smooth lines).

can modify the graphics features quickly with little modification.

some extra stuff

# Smooth curves only (no points): one tmax trend per station across the year.
weather_df %>%
  ggplot(aes(x = date, y = tmax, color = name)) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).

# 2D density: count how many data points fall in each region of the plot and
# use fill colour to show that count, instead of overplotting the points.
# geom_bin2d() does the same kind of thing as geom_hex().
weather_df %>%
  ggplot(aes(x = tmax, y = tmin)) +
  geom_hex()
## Warning: Removed 15 rows containing non-finite values (stat_binhex).

# Set the colour as a fixed value OUTSIDE aes(): every point is drawn in blue.
# Inside aes(), "blue" would instead be treated as a one-level data variable,
# so ggplot would pick its own default colour and add a legend entry "blue".
ggplot(weather_df) + geom_point(aes(x = tmax, y = tmin), color = "blue")
## Warning: Removed 15 rows containing missing values (geom_point).

# Here color = "blue" sits INSIDE aes(), so it is mapped like a data variable
# with the single value "blue": the points get the default palette colour and
# a legend entry labelled "blue", rather than being drawn in blue.
weather_df %>%
  ggplot() +
  geom_point(aes(x = tmax, y = tmin, color = "blue"))
## Warning: Removed 15 rows containing missing values (geom_point).

# Histogram of tmax across all stations, using the default number of bins.
weather_df %>%
  ggplot(aes(x = tmax)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).

# For bars, use fill (not color) to colour each bar by station.
# position = "dodge" puts the bars next to each other for side-by-side
# comparison.
# A histogram needs only x; no y aesthetic has to be defined.
weather_df %>%
  ggplot(aes(x = tmax, fill = name)) +
  geom_histogram(position = "dodge", binwidth = 2)
## Warning: Removed 3 rows containing non-finite values (stat_bin).

# why three panels did not work?
# NOTE(review): this geom_hex() is not added to any ggplot() call with `+`, so
# it only prints the layer object (see the output below) and draws nothing;
# presumably a trailing `+` was dropped on the intended plot -- confirm.
   geom_hex()
## geom_hex: na.rm = FALSE
## stat_binhex: na.rm = FALSE
## position_identity
# Jeff's favorite "histogram": smoothed density curves, one per station.
# alpha makes the overlapping fills see-through; adjust = .5 changes the
# default smoothing bandwidth; color = "blue" sets the outline colour.
weather_df %>%
  ggplot(aes(x = tmax, fill = name)) +
  geom_density(alpha = .4, adjust = .5, color = "blue")
## Warning: Removed 3 rows containing non-finite values (stat_density).

# Boxplot of tmax for each station.
weather_df %>%
  ggplot(aes(x = name, y = tmax)) +
  geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

# Violin plots are occasionally useful, mainly when there are too many groups
# to compare otherwise; they make the comparison easy to visualize.

ggplot(weather_df, aes(x = name, y = tmax)) + 
  geom_violin(aes(fill = name), color = "blue", alpha = .5) + 
  # `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0); this
  # layer marks each station's median tmax with a large blue point.
  stat_summary(fun = median, geom = "point", color = "blue", size = 4)
## Warning: Removed 3 rows containing non-finite values (stat_ydensity).
## Warning: Removed 3 rows containing non-finite values (stat_summary).

# Ridge plots (Jeff likes these): one density per location, stacked vertically.
# They make it easy to look at each density, to spot bimodality, and to
# compare across groups.
weather_df %>%
  ggplot(aes(x = tmax, y = name)) +
  geom_density_ridges(scale = .85)
## Picking joint bandwidth of 1.84
## Warning: Removed 3 rows containing non-finite values (stat_density_ridges).

## Piping lets you make plots from the same dataset over and over again. ## se = FALSE: showing standard errors as a CI might be confusing. ## Connect ggplot layers using +; connect pipeline steps using %>%.

saving a plot

# Build the tmax ridge plot, keep it in an object, and write that object to a
# PDF file.
ggp_ridge_temp <-
  ggplot(weather_df, aes(x = tmax, y = name)) +
  geom_density_ridges(scale = .85)

ggsave(filename = "ggplot_temp_ridge.pdf", plot = ggp_ridge_temp)
## Saving 12 x 5 in image
## Picking joint bandwidth of 1.84
## Warning: Removed 3 rows containing non-finite values (stat_density_ridges).

embedding the plots inside

controlling figure width

Need to debug this code chunk.

can setup width and height

# Same ridge plot as before, saved again; no width/height is passed to
# ggsave(), so the default size is used.
ggp_ridge_temp <-
  ggplot(weather_df, aes(x = tmax, y = name)) +
  geom_density_ridges(scale = .85)

ggsave(filename = "ggplot_temp_ridge.pdf", plot = ggp_ridge_temp)
## Saving 12 x 5 in image
## Picking joint bandwidth of 1.84
## Warning: Removed 3 rows containing non-finite values (stat_density_ridges).

class stops here

The code below will be discussed next class.

# Overlaid precipitation densities, filled by station.
weather_df %>%
  ggplot(aes(x = prcp)) +
  geom_density(aes(fill = name), alpha = .5)
## Warning: Removed 3 rows containing non-finite values (stat_density).

# Precipitation as a ridge plot: one density per station.
weather_df %>%
  ggplot(aes(x = prcp, y = name)) +
  geom_density_ridges(scale = .85)
## Picking joint bandwidth of 4.61
## Warning: Removed 3 rows containing non-finite values (stat_density_ridges).

# Precipitation boxplot for each station.
weather_df %>%
  ggplot(aes(y = prcp, x = name)) +
  geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

# Ridge plot of precipitation, restricted to rows with prcp above zero.
ggplot(filter(weather_df, prcp > 0), aes(x = prcp, y = name)) +
  geom_density_ridges(scale = .85)
## Picking joint bandwidth of 19.7

# Store the coloured tmin-vs-tmax scatterplot in an object ...
weather_plot <-
  weather_df %>%
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name), alpha = .5)

# ... and write it to a PDF with an explicit width and height.
ggsave(filename = "weather_plot.pdf", plot = weather_plot, width = 8, height = 5)
## Warning: Removed 15 rows containing missing values (geom_point).
# Set knitr chunk options for figure width, aspect ratio, and output width.
knitr::opts_chunk$set(fig.width = 6, fig.asp = 0.6, out.width = "90%")

# tmin-vs-tmax scatterplot, coloured by station.
weather_df %>%
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point(aes(color = name))
## Warning: Removed 15 rows containing missing values (geom_point).

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.